given the following un-normalized posterior distribution
$ g(\theta|x) \propto \frac{1}{2} \exp\left(-\frac{(\theta + 3)^2}{2}\right) + \frac{1}{2} \exp\left(-\frac{(\theta - 3)^2}{2}\right) $
Please note that LaTeX is a typesetting language commonly used for mathematical expressions. It provides a more standardized and visually appealing representation of mathematical equations.
a) draw a Markov Chain from the posterior distribution using a Metropolis-Hastings algorithm
b) use a Norm (0, 1) as random-walk candidate density
c) plot the sampled distribution
d) analyze the chain with the CODA package and plot the chain autocorrelation
e) try to use different burn-in cycles and thinning and plot the corresponding posterior distribution
and the chain autocorrelation function. What are the best parameters ?
a,c) I draw a Markov Chain from the g function using a Metropolis-Hastings algorithm. The simulation starts with a burn-in phase. The n.burn.in refers to the number of initial iterations that are discarded before the samples are considered representative of the target distribution. The purpose of burn-in is to allow the MCMC chain to reach the equilibrium state.
After that phase the code proceeds to the main sampling loop where the Metropolis algorithm is applied to generate the desired number of samples. For each iteration in the algorithm, a new proposed value for theta is generated using a normal distribution with mean theta.cur and standard deviation sigma.
The Metropolis ratio is calculated as the base 10 logarithm of the ratio between the probability density at the new proposed point (func.Prop) and the probability density at the current point (func.Cur).
The Metropolis ratio is then compared to a random value drawn from a uniform distribution between 0 and 1. If the Metropolis ratio is greater than or equal to 0 or greater than the base 10 logarithm of the randomly drawn value, the new proposed point is accepted as the new current point; otherwise, the current point is retained.
library (coda)
Warning message: “il pacchetto ‘coda’ è stato creato con R versione 4.2.1”
#define the prior function
g <- function(theta) {
return(0.5 * exp(-((theta + 3)^2) / 2) + 0.5 * exp(-((theta - 3)^2) / 2))
}
# gets the log10 of g function
g.metropolis <- function (theta) {
return (log10(g(theta)))
}
metropolis_1dim <- function(func, theta.init, n.sample, n.burn.in, sigma, show = FALSE) {
theta.cur <- theta.init
func.Cur <- func(theta.cur)
func.Samp <- matrix(data = NA, nrow = n.sample, ncol = 2)
n.accept <- 0
for (burn in 1:n.burn.in) {
theta.prop <- rnorm(n = 1, mean = theta.cur, sigma) # proposed value for theta
func.Prop <- func(theta.prop)
logMR <- func.Prop - func.Cur
if (logMR >= 0 || logMR > log10(runif(1))) {
n.accept <- n.accept + 1
}
}
for (n in 1:n.sample) {
theta.prop <- rnorm(n = 1, mean = theta.cur, sigma) # proposed value for theta
func.Prop <- func(theta.prop)
logMR <- func.Prop - func.Cur
if (logMR >= 0 || logMR > log10(runif(1))) {
theta.cur <- theta.prop
func.Cur <- func.Prop
n.accept <- n.accept + 1
}
func.Samp[n, 1] <- func.Cur
func.Samp[n, 2] <- theta.cur
}
acceptance_rate <- 100 * n.accept / n.sample
if (show) {
print(paste("Acceptance Rate:", acceptance_rate, "%"))
}
return(func.Samp)
}
#set simulation parameters
theta.init <- -20
sample.sig <- 8
n.sample <- 10^6
n.burn.in <- 100
#run the MCMC simulation
set.seed(20190513)
chain <- metropolis_1dim(func= g.metropolis , theta.init = theta.init ,
n.sample = n.sample , n.burn.in=n.burn.in,sigma = sample.sig**2, show=FALSE)
#plot the g function
par( mfrow=c(1,2))
options(repr.plot.width = 15, repr.plot.height = 10)
x <- seq(-10, 10, length.out=10**4)
y <- g(x)
ymax <- 1.05 * max(y)
plot(x, y, ylim=c(0,max(y)*1.10), type='l', lwd=2, col='blue', main='Analytical', xlab= expression(theta), ylab = expression( paste ('f(',theta ,')', sep='')))
#plot the sampled points from g
#is required to do the 10**y to come back from the exponential form to the std one
Y <- chain[, 1]
X <- chain[, 2]
plot(X, 10**Y, type='p', pch='.', col='black', xlim=c(-10,10), ylim=c(0,max(y)*1.10),
main='MCMC', xlab= expression(theta), ylab = expression( paste ('f(',theta ,')', sep='')))
b) Try to use a Norm (0, 1) as random-walk candidate density.
The Metropolis MCMC function was rewritten by using a random sample, theta, drawn from a Gaussian distribution with a mean of 0 and a standard deviation of 1. The new approach differs from the previous method, which used a Gaussian distribution with a mean based on the current point and a user-defined standard deviation. The results of the two methods are not the same, in fact in the second case, the sampling is more concentrated around zero, while the tails of the function are left less explored.
metropolis_1dim_2 <- function(func, theta.init, n.sample, show = FALSE) {
theta.cur <- theta.init
func.Cur <- func(theta.cur)
func.Samp <- matrix(data = NA, nrow = n.sample, ncol = 2)
n.accept <- 0
for (n in 1:n.sample) {
theta.prop <- rnorm(n = 1, mean = 0, 1) # proposed value for theta
func.Prop <- func(theta.prop)
logMR <- func.Prop - func.Cur
if (logMR >= 0 || logMR > log10(runif(1))) {
theta.cur <- theta.prop
func.Cur <- func.Prop
n.accept <- n.accept + 1
}
func.Samp[n, 1] <- func.Cur
func.Samp[n, 2] <- theta.cur
}
acceptance_rate <- 100 * n.accept / n.sample
if (show) {
print(paste("Acceptance Rate:", acceptance_rate, "%"))
}
return(func.Samp)
}
#set simulation parameters
theta.init <- -20
n.sample <- 10^6
#run the MCMC simulation
set.seed(20190513)
chain <- metropolis_1dim_2(func= g.metropolis , theta.init = theta.init ,
n.sample = n.sample, show=FALSE)
#plot the g function
par( mfrow=c(1,2))
options(repr.plot.width = 15, repr.plot.height = 10)
x <- seq(-10, 10, length.out=10**4)
y <- g(x)
ymax <- 1.05 * max(y)
plot(x, y, ylim=c(0,max(y)*1.10), type='l', lwd=2, col='blue', main='Analytical', xlab= expression(theta), ylab = expression( paste ('f(',theta ,')', sep='')))
#plot the sampled points from g
Y2 <- chain[, 1]
X2 <- chain[, 2]
plot(X2, 10**Y2, type='p', pch='.', col='black', xlim=c(-10,10), ylim=c(0,max(y)*1.10), main='MCMC', xlab= expression(theta), ylab = expression( paste ('f(',theta ,')', sep='')))
d) analyze the chain with the CODA package and plot the chain autocorrelation.
I create a sequence of lags from 0 to 500 to represent the time intervals at which the autocorrelation will be computed. I compute the autocorrelation function (ACF) for the chain and I visualize the results in a scatter plot. Since the autocorrelation measures how closely the chain is correlated with itself h steps later, if the autocorrelation is small at higher lag times, it indicates that the current sample is not significantly influenced by its past samples, implying that the samples are approximately independent. This indicates that the chain has reached a convergence and the generated samples are representative of the target distribution.
chain1 <- as.mcmc(chain[,2])
my.lags = seq(0,500,2)
y1 <- autocorr(chain1, lags=my.lags)
plot(my.lags, y1, ylim=c(0, 1), pch=19, col='navy', xlab='lag', ylab='ACF', main = 'Chain autocorrelation',cex=1.5,cex.lab=1.5, cex.main=2)
text(400, 0.9, paste('sigma = 8'), cex=1.5)
text(400, 0.85, sprintf("effective size: %.2f", effectiveSize(chain1)), cex=1.5)
e) try to use different burn-in cycles and thinning and plot the corresponding posterior distribution and the chain autocorrelation function. What are the best parameters ?
The goal is to find the tradeoff between: 1) Burn in. The purpose of burn-in is to allow the MCMC chain to reach the equilibrium state. A longer burn-in period helps to ensure that the initial transient behavior does not affect the estimation of the target distribution.
2) Thinning. Thinning refers to the process of subsampling the chain at regular intervals to reduce autocorrelation between successive samples. It helps to improve the efficiency of the MCMC chain by reducing the number of highly correlated samples. A larger thinning interval reduces the autocorrelation but may result in a loss of information.
The best parameters for burn-in and thinning are those that minimize autocorrelation and maximize the effective sample size (namely the measure of the number of independent and informative samples obtained from the MCMC simulation). By visually inspecting the autocorrelation plots the best combination is burn-in =100 and thinning parameter=5. In fact in that case the chain appear to be uncorrelated at lag time =20 and the effective sample size is maximized.
n.burn.in.values <- c(50, 100, 150, 200) # Different burn-in values
thinning.values <- c(1, 2, 5, 10) # Different thinning values
par(mfrow = c(length(n.burn.in.values), length(thinning.values))) # Set up the layout for the plots
for (i in 1:length(n.burn.in.values)) {
n.burn.in <- n.burn.in.values[i]
for (j in 1:length(thinning.values)) {
interval <- thinning.values[j]
#simulation
n.sample <- 10^6
chain <- metropolis_1dim(func = g.metropolis, theta.init = theta.init,
n.sample = n.sample, n.burn.in = n.burn.in, sigma = sample.sig**2, show = FALSE)
#thinning
chain_thin <- chain[seq(1, nrow(chain), by = interval), ]
X_thin <- chain_thin[, 2]
Y_thin <- chain_thin[, 1]
#autocorrelation
chain1 <- as.mcmc(chain_thin[, 2])
my.lags <- seq(0, 100, 1)
y1 <- autocorr(chain1, lags = my.lags)
#plot autocorrelation
plot(my.lags, y1, ylim = c(0, 1), pch = 19, col = 'navy', xlab = 'lag', ylab = 'ACF',
main = paste("Burn-in:", n.burn.in, "Thinning:", interval), cex = 1.5, cex.lab = 1.5, cex.main = 2)
text(50, 0.45, sprintf("E. size: %.2f", effectiveSize(chain1)), cex = 1.5)
#plot sampled function
plot(X_thin, 10**Y_thin, type='p', pch='.', col='black', xlim=c(-10,10), ylim=c(0,max(y)*1.10), main = paste("Sampling: "," Burn-in:", n.burn.in, "Thinning:", interval), cex = 1.5, cex.lab = 1.5, cex.main = 2, xlab= expression(theta), ylab = expression( paste ('f(',theta ,')', sep='')))
}
}
The European Medicines Agency (EMA) has authorized a list of COVID-19 vaccines, after having performed a scientific evaluation of the veccines efficacy The following vaccines are currently authorized for use in the European Union:
Analyze the initial test data reported on the EMA Web site for the following early Vaccines
and create a Markow Chain Monte Carlo JAGS or stan the efficacy of each Vaccine. Infere the 95% credibility interval.
| Vaccine | N (Vax) | Cases (COVID-19) (Vax) | N (Placebo) | Cases (COVID-19) (Placebo) | Efficacy (%) |
|---|---|---|---|---|---|
| Vaxzevria | 5258 | 64 | 5210 | 154 | 74.0 |
| Janssen | 19630 | 116 | 19691 | 348 | 67.0 |
| moderna | 14134 | 11 | 14073 | 185 | 94.1 |
Initially I define some useful functions: organize data and get data. The first one takes four input parameters the output of the function get data) and performs the following steps:
The 'get data' function, given the 3 vaccines data, produces the following output when specifying a certain vaccine (V, J or M): number of vaccinated participants for each vaccine (tot_vaccine), number of COVID-19 cases among vaccinated for each vaccine (tot_placebo), number of participants in the placebo group for each vaccine (pos_vaccine) , and number of COVID-19 cases in the placebo group for each vaccine (pos_placebo).
Then a jags model is defined, using a Bernoulli distribution for the 'tested' variable. The model specifies that the 'tested' variable follows a Bernoulli distribution based on the theta parameter, which is estimated using a beta distribution with fixed parameters (3 and 100), where patient[i] indicates the group (placebo or vaccine) to which the ith observation belongs In the code, the first loop iterates over each observation (i) in the total number of observations (Ntot). It assigns a Bernoulli distribution to tested[i].
After defining the JAGS model, the following procedure is repeated for each vaccine:
Retrieve the data specific to that vaccine and organize it into a list format using the previously defined functions.
Run the JAGS model to estimate the infection rates for both the placebo and vaccine groups, represented by theta[1] and theta[2], respectively.
Calculate the percentage difference in infection rates between the two groups (= efficacy) by computing also the mean and the 95% confidence interval (CI). This provides an estimate of the relative effectiveness of the vaccine compared to the placebo in terms of reducing infection rates.
library(rjags)
library(coda)
require(runjags)
library (tidybayes)
library(dplyr)
library(tibble)
organize_data <- function(tot_vaccine, tot_placebo, pos_vaccine, pos_placebo) {
# Create patient (vaccine or placebo) and tested (pos or neg) vectors
patient <- c(rep("Vaccine", tot_vaccine), rep("Placebo", tot_placebo))
tested <- c(rep("Pos", pos_vaccine),
rep("Neg", tot_vaccine - pos_vaccine),
rep("Pos", pos_placebo),
rep("Neg", tot_placebo - pos_placebo))
# Create tibble and calculate frequency table
vax.tb <- tibble(tested = tested, patient = patient)
frequency_table <- table(vax.tb$patient, vax.tb$tested)
# Create dataList with binary values for patient and tested to be used in JAGS
# include also the total number of tested people
# and the number of possible values that the array patient can assume (2: vaccine or placebo in this ex)
dataList = list(
tested = ifelse(vax.tb$tested == "Neg", 0, 1),
patient = as.integer ( factor ( vax.tb$ patient )),
Ntot = nrow(vax.tb) ,
Nclass = nlevels ( factor ( vax.tb$ patient ))
)
return(list(frequency_table = frequency_table, vax.tb = vax.tb, patient=patient, tested=tested, dataList=dataList))
}
get_data <- function(vaccine_type) {
#all data for the 3 vaccines
n_vax <- c(5258, 19630, 14134) # Number of vaccinated participants for each vaccine
n_cases_vax <- c(64, 116, 11) # Number of COVID-19 cases among vaccinated for each vaccine
n_placebo <- c(5210, 19691, 14073) # Number of participants in the placebo group for each vaccine
n_cases_placebo <- c(154, 348, 185) # Number of COVID-19 cases in the placebo group for each vaccine
#assign a index to each vaccine (1=Vaxzeviria, 2=Jannsen, 3=moderna)
index <- match(vaccine_type, c("V", "J", "M"))
tot_vaccine <- n_vax[index]
tot_placebo <- n_placebo[index]
pos_vaccine <- n_cases_vax[index]
pos_placebo <- n_cases_placebo[index]
return(list(tot_vaccine = tot_vaccine,
tot_placebo = tot_placebo,
pos_vaccine = pos_vaccine,
pos_placebo = pos_placebo))
}
#define the jags model
#1st loop: binomial likelihood for the tested variable (can be only pos or neg)
#2nd loop: beta prior
modelString <- " model {
for ( i in 1:Ntot ) {
tested[i] ~ dbern(theta[ patient[i]])
}
for ( k in 1: Nclass ) {
theta[k] ~ dbeta (3 , 100)
}}"
#retrive data for vaxzeviria vaccine
Vaxeviria <- get_data("V")
tot_vaccine <- Vaxeviria$tot_vaccine
tot_placebo <- Vaxeviria$tot_placebo
pos_vaccine <- Vaxeviria$pos_vaccine
pos_placebo <- Vaxeviria$pos_placebo
#organize the data and put it in a suitable form for run jags
Vaxzeviria_organized <- organize_data(tot_vaccine, tot_placebo, pos_vaccine, pos_placebo)
dataList_V <- Vaxzeviria_organized$dataList
Vaxzeviria.tb <- Vaxzeviria_organized$vax.tb
Ntot = nrow( Vaxzeviria.tb)
Nclass = nlevels ( factor ( Vaxzeviria.tb$patient ))
#run the model
Vaxzeviria.chain <- run.jags( modelString , sample = 15000, n.chains = 4, method = "rjags", monitor = "theta", data = dataList_V )
Warning message: “No initial values were provided - JAGS will use the same initial values for all chains”
Compiling rjags model... Calling the simulation using the rjags method... Note: the model did not require adaptation Burning in the model for 4000 iterations... Running the model for 15000 iterations... Simulation complete Calculating summary statistics... Calculating the Gelman-Rubin statistic for 2 variables.... Finished running the simulation
summary ( Vaxzeviria.chain )
| Lower95 | Median | Upper95 | Mean | SD | Mode | MCerr | MC%ofSD | SSeff | AC.10 | psrf | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| theta[1] | 0.025098294 | 0.02949037 | 0.03416458 | 0.02954313 | 0.002318194 | NA | 1.159097e-05 | 0.5 | 40000 | -0.001165453 | 1.000056 |
| theta[2] | 0.009561192 | 0.01244180 | 0.01544542 | 0.01249741 | 0.001510944 | NA | 7.581778e-06 | 0.5 | 39715 | -0.001857742 | 1.000015 |
Vaxzeviria.chain.df <- as.data.frame ( as.mcmc( Vaxzeviria.chain ) )
placebo_V <- Vaxzeviria.chain.df[,1]
vaccine_V <- Vaxzeviria.chain.df[,2]
#Plot the estimated infection rates for both the placebo and vaccine groups
par(mfrow = c(1,2)) # Set up the layout for the plots
options(repr.plot.width = 15, repr.plot.height = 10)
hist(placebo_V, breaks=26, main = "Rate of infection: Placebo", xlab = "Rate of infection", col = "#993366", density=30)
segments(quantile(placebo_V,0.025), 0, quantile(placebo_V,0.975), 0, col = "black", lwd =4)
text(0.038, 7900, paste('Mean = ',round(mean(placebo_V),3)), col = "black")
hist(vaccine_V, breaks=18, main = "Rate of infection: vaccinated", xlab = "Rate of infection", col = "#33FF33", density=30)
segments(quantile(vaccine_V,0.025), 0, quantile(vaccine_V,0.975), 0, col = "black", lwd =4)
text(0.016, 12000, paste('Mean = ',round(mean(vaccine_V),3)), col = "black")
Warning message in as.mcmc.runjags(Vaxzeviria.chain): “Combining the 4 mcmc chains together”
#Calculate the percentage difference in infection rates between the two groups
Vaxzeviria_res <- tidybayes::tidy_draws(Vaxzeviria.chain) %>%
dplyr::select('theta[1]':'theta[2]') %>%
dplyr::rename(Placebo = 'theta[1]', Vaccine = 'theta[2]') %>%
dplyr::mutate(diff_rate = (Placebo - Vaccine) / Placebo * 100,
Placebo_perc = Placebo * 100,
Vaccine_perc = Vaccine * 100)
allmcmc2_V <- as.mcmc( Vaxzeviria_res ,"diff_rate")
source("DBDA2E-utilities.R")
options(repr.plot.width = 15, repr.plot.height = 10)
pt3 <- plotPost( allmcmc2_V[,"diff_rate"], col = "steelblue", main = "Bayesian Analysis of Vaxzeviria Vaccine Efficacy")
********************************************************************* Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition: A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier. *********************************************************************
#retrive data
Janssen <- get_data("J")
tot_vaccine <- Janssen$tot_vaccine
tot_placebo <- Janssen$tot_placebo
pos_vaccine <- Janssen$pos_vaccine
pos_placebo <- Janssen$pos_placebo
#organize data into a list
Janssen_organized <- organize_data(tot_vaccine, tot_placebo, pos_vaccine, pos_placebo)
dataList_J <- Janssen_organized$dataList
Janssen.tb <- Janssen_organized$vax.tb
Ntot = nrow( Janssen.tb)
Nclass = nlevels ( factor ( Janssen.tb$patient ))
#run the model to compute posteriors theta[1] and theta[2],
#representing the infection rates for the placebo and vaccine groups
Janssen.chain <- run.jags( modelString , sample = 15000, n.chains = 4, method = "rjags", monitor = "theta", data = dataList_J )
summary ( Janssen.chain )
Compiling rjags model... Calling the simulation using the rjags method... Note: the model did not require adaptation Burning in the model for 4000 iterations... Running the model for 15000 iterations... Simulation complete Calculating summary statistics... Calculating the Gelman-Rubin statistic for 2 variables.... Finished running the simulation
| Lower95 | Median | Upper95 | Mean | SD | Mode | MCerr | MC%ofSD | SSeff | AC.10 | psrf | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| theta[1] | 0.015933874 | 0.01772045 | 0.0195887 | 0.017733397 | 0.0009346769 | NA | 4.651377e-06 | 0.5 | 40379 | -0.0022014441 | 1.0000388 |
| theta[2] | 0.004986857 | 0.00602413 | 0.0071332 | 0.006040034 | 0.0005488271 | NA | 2.731956e-06 | 0.5 | 40357 | 0.0009299195 | 0.9999845 |
Janssen.chain.df <- as.data.frame ( as.mcmc( Janssen.chain ) )
placebo_J <- Janssen.chain.df[,1]
vaccine_J <- Janssen.chain.df[,2]
#Plot the estimated infection rates for both the placebo and vaccine groups
par(mfrow = c(1,2)) # Set up the layout for the plots
options(repr.plot.width = 15, repr.plot.height = 10)
hist(placebo_J, breaks=26, main = "Rate of infection: Placebo", xlab = "Rate of infection", col = "#993366", density=30)
segments(quantile(placebo_J,0.025), 0, quantile(placebo_J,0.975), 0, col = "black", lwd =4)
text(0.02, 11900, paste('Mean = ',round(mean(placebo_J),3)), col = "black")
options(repr.plot.width = 15, repr.plot.height = 10)
hist(vaccine_J, breaks=18, main = "Rate of infection: vaccinated", xlab = "Rate of infection", col = "#33FF33", density=30)
segments(quantile(vaccine_J,0.025), 0, quantile(vaccine_J,0.975), 0, col = "black", lwd =4)
text(0.008, 7900, paste('Mean = ',round(mean(vaccine_J),3)), col = "black")
Warning message in as.mcmc.runjags(Janssen.chain): “Combining the 4 mcmc chains together”
#Calculate the percentage difference in infection rates between the two groups
Janssen_res <- tidybayes::tidy_draws(Janssen.chain) %>%
dplyr::select('theta[1]':'theta[2]') %>%
dplyr::rename(Placebo = 'theta[1]', Vaccine = 'theta[2]') %>%
dplyr::mutate(diff_rate = (Placebo - Vaccine) / Placebo * 100,
Placebo_perc = Placebo * 100,
Vaccine_perc = Vaccine * 100)
allmcmc2_J <- as.mcmc( Janssen_res , vars="diff_rate")
source("DBDA2E-utilities.R")
options(repr.plot.width = 15, repr.plot.height = 10)
pt3 <- plotPost( allmcmc2_J[,"diff_rate"], col = "steelblue", main = "Bayesian Analysis of Jannsen Vaccine Efficacy")
********************************************************************* Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition: A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier. *********************************************************************
Moderna <- get_data("M")
tot_vaccine <- Moderna$tot_vaccine
tot_placebo <- Moderna$tot_placebo
pos_vaccine <- Moderna$pos_vaccine
pos_placebo <- Moderna$pos_placebo
Moderna_organized <- organize_data(tot_vaccine, tot_placebo, pos_vaccine, pos_placebo)
dataList_M<- Moderna_organized$dataList
Moderna.tb <- Moderna_organized$vax.tb
Ntot = nrow( Moderna.tb)
Nclass = nlevels ( factor ( Moderna.tb$patient ))
Moderna.chain <- run.jags( modelString , sample = 15000, n.chains = 4, method = "rjags", monitor = "theta", data = dataList_J )
summary ( Moderna.chain )
Compiling rjags model... Calling the simulation using the rjags method... Note: the model did not require adaptation Burning in the model for 4000 iterations... Running the model for 15000 iterations... Simulation complete Calculating summary statistics... Calculating the Gelman-Rubin statistic for 2 variables.... Finished running the simulation
| Lower95 | Median | Upper95 | Mean | SD | Mode | MCerr | MC%ofSD | SSeff | AC.10 | psrf | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| theta[1] | 0.015899391 | 0.017709395 | 0.019584712 | 0.017729495 | 0.0009390956 | NA | 4.743197e-06 | 0.5 | 39199 | -0.0008734398 | 1.000068 |
| theta[2] | 0.004958751 | 0.006009073 | 0.007107404 | 0.006026455 | 0.0005512494 | NA | 2.756247e-06 | 0.5 | 40000 | 0.0001390105 | 1.000049 |
Moderna.chain.df <- as.data.frame ( as.mcmc(Moderna.chain ) )
placebo_M <- Moderna.chain.df[,1]
vaccine_M <- Moderna.chain.df[,2]
#Plot the estimated infection rates for both the placebo and vaccine groups
par(mfrow = c(1,2)) # Set up the layout for the plots
hist(placebo_M, breaks=26, main = "Rate of infection: Placebo", xlab = "Rate of infection", col = "#993366", density=30)
segments(quantile(placebo_M,0.025), 0, quantile(placebo_M,0.975), 0, col = "black", lwd =4)
text(0.02, 11900, paste('Mean = ',round(mean(placebo_M),3)), col = "black")
hist(vaccine_M, breaks=18, main = "Rate of infection: vaccinated", xlab = "Rate of infection", col = "#33FF33", density=30)
segments(quantile(vaccine_M,0.025), 0, quantile(vaccine_M,0.975), 0, col = "black", lwd =4)
text(0.008, 7900, paste('Mean = ',round(mean(vaccine_M),3)), col = "black")
Warning message in as.mcmc.runjags(Moderna.chain): “Combining the 4 mcmc chains together”
#Calculate the percentage difference in infection rates between the two groups
Moderna_res <- tidybayes::tidy_draws(Moderna.chain) %>%
dplyr::select('theta[1]':'theta[2]') %>%
dplyr::rename(Placebo = 'theta[1]', Vaccine = 'theta[2]') %>%
dplyr::mutate(diff_rate = (Placebo - Vaccine) / Placebo * 100,
Placebo_perc = Placebo * 100,
Vaccine_perc = Vaccine * 100)
allmcmc2_M <- as.mcmc(Moderna_res , vars="diff_rate")
source("DBDA2E-utilities.R")
options(repr.plot.width = 15, repr.plot.height = 10)
pt3 <- plotPost( allmcmc2_M[,"diff_rate"],col = "steelblue", main = "Bayesian Analysis of Moderna Vaccine Efficacy")
********************************************************************* Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition: A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier. *********************************************************************
• according to the official COVID-19 vaccination data, 70% of the world population has received at least one dose of a COVID-19 vaccine. A global vaccination dataset is available [5] • the European Centre for Disease Prevention and Control published a downloadable file [6] containing information on COVID-19 vaccination in the EU/EEA. • analyze the data and produce the following plots:
library(dplyr)
library(lubridate)
library(ggplot2)
library(patchwork)
# Set the file path
vaccinations_path <- "vaccinations.csv"
# Load the CSV file
vaccinations <- read.csv(vaccinations_path)
# Define a vector of ISO codes for European countries
european_countries = c("Albania", "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czech Republic",
"Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland",
"Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta", "Netherlands",
"Norway", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden",
"Switzerland", "United Kingdom")
# Filter the data to select only the countries in Europe
vaccinations_eu <- vaccinations %>% filter(location %in% european_countries)
vaccinations_eu <- vaccinations_eu %>% filter (!is.na(people_vaccinated)) #delete the NA
#Calculate the day of the year for each date
vaccinations_eu_day <- vaccinations_eu
vaccinations_eu_day$date <- as.Date(vaccinations_eu$date, format = "%Y-%m-%d")
#group by day of the year and sum the vaccinated people for each day
vaccinations_eu_day <- vaccinations_eu_day%>% group_by( date) %>% summarise(total_vaccinated=sum(people_vaccinated))
#plot
options(repr.plot.width = 15, repr.plot.height = 15)
plot1 <- ggplot(vaccinations_eu_day, aes(x = date, y = total_vaccinated)) +
geom_line(color = 'blue') +
geom_point(color = 'blue') +
labs(x = "Day of the year", y = "Total vaccinated", title = "Total vaccinated per day", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot1
#Calculate the week of the year for each date
vaccinations_eu_week <- vaccinations_eu
vaccinations_eu_week$week_of_year <- week(vaccinations_eu$date)
vaccinations_eu_week$year <- year(vaccinations_eu$date)
#group by week of the year and average the vaccinated people for each week
vaccinations_eu_week <- vaccinations_eu_week%>% group_by( week_of_year, year) %>% summarise(avg_vaccinated=sum(people_vaccinated)/7)
vaccinations_eu_week[1:4,]
#divide by year
vaccinations_eu_week_2021 <- vaccinations_eu_week%>% filter(year == 2021)
vaccinations_eu_week_2022 <- vaccinations_eu_week %>% filter(year == 2022)
vaccinations_eu_week_2023 <- vaccinations_eu_week %>% filter(year == 2023)
#plot
options(repr.plot.width = 15, repr.plot.height = 15)
plot2a <- ggplot(vaccinations_eu_week_2021, aes(x = week_of_year, y = avg_vaccinated)) +
geom_ribbon(aes(ymin = 0, ymax = avg_vaccinated), fill = "blue", alpha = 0.2) + #ribbon for filling
geom_line(color = 'steelblue') +
geom_point(color = 'blue') +
labs(x = "Week of the year", y = "Mean vaccinated", title = "Mean number of vaccinated per week in 2021", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
options(repr.plot.width = 15, repr.plot.height = 15)
plot2b <- ggplot(vaccinations_eu_week_2022, aes(x = week_of_year, y = avg_vaccinated)) +
geom_ribbon(aes(ymin = 0, ymax = avg_vaccinated), fill = "blue", alpha = 0.2) +
geom_line(color = 'steelblue') +
geom_point(color = 'blue') +
labs(x = "Week of the year", y = "Mean vaccinated", title = "Mean number of vaccinated per week in 2022", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
options(repr.plot.width = 15, repr.plot.height = 15)
plot2c <- ggplot(vaccinations_eu_week_2023, aes(x = week_of_year, y = avg_vaccinated)) +
geom_ribbon(aes(ymin = 0, ymax = avg_vaccinated), fill = "blue", alpha = 0.2) +
geom_line(color = 'steelblue') +
geom_point(color = 'blue') +
labs(x = "Week of the year", y = "Mean vaccinated", title = "Mean number of vaccinated per week in 2023", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
# Create a grid of plots using the patchwork package
plot <- (plot2a) / (plot2b) / (plot2c)
plot
`summarise()` has grouped output by 'week_of_year'. You can override using the `.groups` argument.
| week_of_year | year | avg_vaccinated |
|---|---|---|
| <dbl> | <dbl> | <dbl> |
| 1 | 2021 | 888817.6 |
| 1 | 2022 | 311229988.3 |
| 1 | 2023 | 230266299.1 |
| 2 | 2021 | 4831346.7 |
#compute the cumulative number of vaccinated people per day
vaccinations_eu_cum <- vaccinations_eu_day
vaccinations_eu_cum$total_vaccinated_cum <- cumsum(vaccinations_eu_day$total_vaccinated)
#vaccinations_eu_cum
#plot
options(repr.plot.width = 15, repr.plot.height = 15)
plot2 <- ggplot(vaccinations_eu_cum, aes( x = date , y = total_vaccinated_cum )) +
geom_line(color='red') +
geom_ribbon(aes(ymin = 0, ymax = total_vaccinated_cum), fill = 'red', alpha = 0.2) +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))+
geom_point(color='red') +
labs(x = "Day of the year", y= "Total cumulative vaccinated" ,title = "Total cumulative vaccinated per day", color = "")
plot2
b) number of confirmed deaths by COVID-19, both cumulative (and dayly) and weekly average
# Set the file path
new_deaths_path <- "new_deaths.csv"
# Load the CSV file
new_deaths <- read.csv(new_deaths_path)
# Select columns for EU countries
eu_countries <- c("Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia",
"Finland", "France", "Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia",
"Lithuania", "Luxembourg", "Malta", "Netherlands", "Poland", "Portugal", "Romania",
"Slovakia", "Slovenia", "Spain", "Sweden")
# Filter the data to select only the countries in Europe
new_deaths_eu <- new_deaths %>% select(date, all_of(eu_countries))
#delete the NA
new_deaths_eu <- na.omit(new_deaths_eu)
# Convert date column to Date type
new_deaths_eu$date <- as.Date(new_deaths_eu$date)
#compute the total number of deaths per day
new_deaths_eu_sum <- new_deaths_eu %>% group_by( date) %>% summarise(deaths=sum(across(matches(paste(eu_countries, collapse = "|")))))
new_deaths_eu_sum[60:64,] #show some exemples
new_deaths_eu_cum <- new_deaths_eu_sum
new_deaths_eu_cum$death_cum <- cumsum(new_deaths_eu_sum$deaths)
new_deaths_eu_cum[60:64,]
#plot
options(repr.plot.width = 15, repr.plot.height = 15)
plotd <- ggplot(new_deaths_eu_sum, aes(x = date, y = deaths)) +
geom_line(color = 'navy') +
geom_point(color = 'blue') +
labs(x = "day of the year", y = "Deaths", title = "Total number of deaths per day", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plotd
ggplot(new_deaths_eu_cum, aes(x = date, y = death_cum)) +
geom_line(color = 'navy') +
geom_point(color = 'blue') +
geom_ribbon(aes(ymin = 0, ymax = death_cum), fill = 'blue', alpha = 0.2) +
labs(x = "day of the year", y = "Deaths", title = "Cumulative number of deaths per day", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
| date | deaths |
|---|---|
| <date> | <dbl> |
| 2020-03-02 | 5 |
| 2020-03-03 | 19 |
| 2020-03-04 | 29 |
| 2020-03-05 | 31 |
| 2020-03-06 | 47 |
| date | deaths | death_cum |
|---|---|---|
| <date> | <dbl> | <dbl> |
| 2020-03-02 | 5 | 38 |
| 2020-03-03 | 19 | 57 |
| 2020-03-04 | 29 | 86 |
| 2020-03-05 | 31 | 117 |
| 2020-03-06 | 47 | 164 |
Weekly COVID 19 confirmed deaths average
deaths_eu_week <- new_deaths_eu_sum
deaths_eu_week$week_of_year <- week(new_deaths_eu_sum$date)
deaths_eu_week$year <- year(new_deaths_eu$date)
deaths_eu_week <- deaths_eu_week%>% group_by( week_of_year, year) %>% summarise(avg_deaths=sum(deaths)/7)
deaths_eu_week[50:54,]
`summarise()` has grouped output by 'week_of_year'. You can override using the `.groups` argument.
| week_of_year | year | avg_deaths |
|---|---|---|
| <dbl> | <dbl> | <dbl> |
| 13 | 2021 | 2466.8571 |
| 13 | 2022 | 1037.0000 |
| 13 | 2023 | 229.2857 |
| 14 | 2020 | 3296.2857 |
| 14 | 2021 | 2423.0000 |
#Calculate the week of the year and yearfor each date
#we start directly from new_deaths_eu_sum, which contains the date and the sum of deaths per day
deaths_eu_week <- new_deaths_eu_sum
deaths_eu_week$week_of_year <- week(new_deaths_eu_sum$date)
deaths_eu_week$year <- year(new_deaths_eu$date)
#group by week of the year and yearand average the deaths for each week
deaths_eu_week <- deaths_eu_week%>% group_by( week_of_year, year) %>% summarise(avg_deaths=sum(deaths)/7)
deaths_eu_week[1:4,]
#divide by year
deaths_eu_week_2021 <- deaths_eu_week%>% filter(year == 2021)
deaths_eu_week_2022 <- deaths_eu_week %>% filter(year == 2022)
deaths_eu_week_2023 <- deaths_eu_week %>% filter(year == 2023)
#plot
options(repr.plot.width = 15, repr.plot.height = 15)
plot2a <- ggplot(deaths_eu_week_2021, aes(x = week_of_year, y = avg_deaths)) +
geom_line(color = 'orchid4', size = 1.5) +
geom_point(color = 'darkorange', size = 4) +
labs(x = "Week of the year", y = "Average COVID-19 deaths",
title = "Average confirmed COVID-19 deaths in 2021", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot2b <- ggplot(deaths_eu_week_2022, aes(x = week_of_year, y = avg_deaths)) +
geom_line(color = 'orchid4', size = 1.5) +
geom_point(color = 'darkorange', size = 4) +
labs(x = "Week of the year", y = "Average COVID-19 deaths",
title = "Average confirmed COVID-19 deaths in 2022", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot2c <- ggplot(deaths_eu_week_2023, aes(x = week_of_year, y = avg_deaths)) +
geom_line(color = 'orchid4', size = 1.5) +
geom_point(color = 'darkorange', size = 4) +
labs(x = "Week of the year", y = "Average COVID-19 deaths",
title = "Average confirmed COVID-19 deaths in 2023", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
# Combine the three plots
library(gridExtra)
grid.arrange(plot2a, plot2b, plot2c, nrow = 3,
top = "Average confirmed COVID-19 deaths in 2021, 2022, and 2023")
`summarise()` has grouped output by 'week_of_year'. You can override using the `.groups` argument.
| week_of_year | year | avg_deaths |
|---|---|---|
| <dbl> | <dbl> | <dbl> |
| 1 | 2020 | 0.0000 |
| 1 | 2021 | 3120.5714 |
| 1 | 2022 | 1515.5714 |
| 1 | 2023 | 584.4286 |
It is interesting also to look at the excess mortality from 2020 to 2023.
# Set the file path
excess_mortality_path <- "excess_mortality.csv"
# Load the CSV file
excess_mortality <- read.csv(excess_mortality_path)
# Filter the data to select only the countries in Europe
excess_mortality_eu <- excess_mortality %>% filter(location %in% european_countries)
excess_mortality_eu <- excess_mortality_eu %>% filter (!is.na(cum_excess_proj_all_ages)) #delete the NA
#Calculate the week of the year for each date
excess_mortality_eu_week <- excess_mortality_eu
excess_mortality_eu_week$date <- as.Date(excess_mortality_eu_week$date, format = "%Y-%m-%d")
excess_mortality_eu_week$year <- year(excess_mortality_eu_week$date)
excess_mortality_eu_week$week <- week(excess_mortality_eu_week$date)
#group by year and week and compute the mean
excess_mortality_eu_week <- excess_mortality_eu_week%>% group_by( week, year ) %>% summarise(avg_deaths=sum(cum_excess_proj_all_ages)/7)
#divide by year
excess_mortality_eu_week_2020 <- excess_mortality_eu_week %>% filter(year == 2020)
excess_mortality_eu_week_2021 <- excess_mortality_eu_week %>% filter(year == 2021)
excess_mortality_eu_week_2022 <- excess_mortality_eu_week %>% filter(year == 2022)
excess_mortality_eu_week_2023 <- excess_mortality_eu_week %>% filter(year == 2023)
options(repr.plot.width = 15, repr.plot.height = 15)
plot4 <- ggplot(excess_mortality_eu_week_2020, aes(x = week, y = avg_deaths)) +
geom_line(color = 'darkgreen') +
geom_point(color = 'darkgreen') +
labs(x = "Week of the year", y = "Mean death", title = "Mean excess_mortality of death per week in 2020", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot5 <- ggplot(excess_mortality_eu_week_2021, aes(x = week, y = avg_deaths)) +
geom_line(color = 'darkgreen') +
geom_point(color = 'darkgreen') +
labs(x = "Week of the year", y = "Mean death", title = "Mean excess mortality per week in 2021", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot6 <- ggplot(excess_mortality_eu_week_2022, aes(x = week, y = avg_deaths)) +
geom_line(color = 'darkgreen') +
geom_point(color = 'darkgreen') +
labs(x = "Week of the year", y = "Mean death", title = "Mean excess mortality per week in 2022", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
plot7<- ggplot(excess_mortality_eu_week_2023, aes(x = week, y = avg_deaths)) +
geom_line(color = 'darkgreen') +
geom_point(color = 'darkgreen') +
labs(x = "Week of the year", y = "Mean death", title = "Mean excess mortality per week in 2023", color = "") +
theme(axis.title = element_text(size = 20),
axis.text = element_text(size = 15),
plot.title = element_text(size = 25))
# Create a grid of plots using the patchwork package
plot <- (plot4) / (plot5) / (plot6)/ (plot7)
plot
`summarise()` has grouped output by 'week'. You can override using the `.groups` argument.